Here, we are trying to explain and demonstrate how to use the ProAli to align the goldfish genome assembly against the zebrafish reference genome.
Firstly, we prepare the input files, the zebrafish reference genomes in fasta format, the zebrafish reference genome annotation in gff(3) format, and the goldfish query genome file in fasta format.
wget http://ftp.ensembl.org/pub/release-91/fasta/danio_rerio/dna/Danio_rerio.GRCz10.dna.toplevel.fa.gz
wget http://ftp.ensembl.org/pub/release-91/gff3/danio_rerio/Danio_rerio.GRCz10.91.chr.gff3.gz
wget https://research.nhgri.nih.gov/goldfish/download/carAur01.sm.fa
gunzip *gz
Genome masking would not improve the performance of AnchorWave.
AnchorWave do not ultilize any soft masking information. Hard masking would increase the computational cost of AnchorWave.
Since the ProAli use collinearity blocks and whole genome duplications to guide the genome alignment. Before running the alignment, we use minimap2 to map reference full-length CDS to query genome, visualize the mapping and get ideas about the collinearity and whole-genome duplication.
firstly I indexed the genome files using samtools faidx command
samtools faidx carAur01.sm.fa
samtools faidx Danio_rerio.GRCz11.dna.primary_assembly.fa
Then we check the name and sequence length for each sequence record manually.
Then we roughly know they are chromosome level genome assemblies(not contig level).
To check collinearity and whole-genome duplication, we only need to focus on chromosomes.
They are "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" for “Branchiostoma_lanceolatum.BraLan2.dna.toplevel.fa”
and "LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29", "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39", "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50" carAur01.sm.fa
since other contigs maight be unclasped heterozygous regions, we only keep chrosomes
head -15923983 carAur01.sm.fa > goldfish.fa
NOTE: please do NOT use CDS extracted using other software. Since AnchorWave filtered some CDS records to minimum the side effect of minimap2 limitation that “Minimap2 often misses small exons” (https://github.com/lh3/minimap2#limitations)
anchorwave gff2seq -r Danio_rerio.GRCz11.dna.primary_assembly.fa -i Danio_rerio.GRCz11.102.chr.gff3 -o cds.fa
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 Danio_rerio.GRCz11.dna.primary_assembly.fa cds.fa > ref.sam
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 goldfish.fa cds.fa > carAur.sam
perl alignmentToDotplot.pl Danio_rerio.GRCz11.102.chr.gff3 carAur.sam > carAur.tab
library(ggplot2)
library(compiler)
enableJIT(3)
## [1] 3
library(ggplot2)
library("Cairo")
changetoM <- function ( position ){
position=position/1000000;
paste(position, "M", sep="")
}
data =read.table("carAur.tab")
data = data[which(data$V1 %in% c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" )),]
data = data[which(data$V3 %in% c("LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29", "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39", "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50")),]
data$V1 = factor(data$V1, levels=c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" ))
data$V3 = factor(data$V3, levels=c("LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29", "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39", "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50" ))
p = ggplot(data=data, aes(x=V4, y=V2))+geom_point(size=1, aes(color=V5))+facet_grid(V1~V3, scales="free", space="free" )+ theme_grey(base_size = 60) +
labs(x="goldfish", y="zebrafish")+scale_x_continuous(labels=changetoM) + scale_y_continuous(labels=changetoM) +
theme(axis.line = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(fill=NA,color="black", size=0.5, linetype="solid"),
axis.text.y = element_text( colour = "black"),
legend.position='none',
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black") )
CairoPNG(file="carAur.png",width = 12000, height = 7000)
p
dev.off()
## png
## 2
Considering the genome duplication and translocation variations, we use the proali function to identify collinear region.
We expect the genome alignment coverage on reference genome (zebrafish) equal with or smaller than 2. We expect the genome alignment coverage on query genome (goldsifh) equal with or smaller than 1. So we set parameter -R 2 -Q 1 As this stage we would like to identify collinear anchors and do not perform base-pair resolution sequence alignment. So we only set -n to output collinear anchors. By setting -ns , we do not would like to identify novel anchors.
anchorwave proali -i Danio_rerio.GRCz11.102.chr.gff3 -r Danio_rerio.GRCz11.dna.primary_assembly.fa -a carAur.sam -as cds.fa -ar ref.sam -s goldfish.fa -n align1.anchors -R 2 -Q 1 -ns
library(ggplot2)
library(compiler)
enableJIT(3)
## [1] 3
library(ggplot2)
library("Cairo")
changetoM <- function ( position ){
position=position/1000000;
paste(position, "M", sep="")
}
data =read.table("align1.anchors", head=TRUE)
data = data[which(data$refChr %in% c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" )),]
data = data[which(data$queryChr %in% c("LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29", "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39", "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50")),]
data$refChr = factor(data$refChr, levels=c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" ))
data$queryChr = factor(data$queryChr, levels=c("LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29", "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39", "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50" ))
p = ggplot(data=data, aes(x=queryStart, y=referenceStart))+geom_point(size=1, aes(color=strand))+facet_grid(refChr~queryChr, scales="free", space="free" )+ theme_grey(base_size = 60) +
labs(x="goldfish", y="zebrafish")+scale_x_continuous(labels=changetoM) + scale_y_continuous(labels=changetoM) +
theme(axis.line = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(fill=NA,color="black", size=0.5, linetype="solid"),
axis.text.y = element_text( colour = "black"),
legend.position='none',
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black") )
CairoPNG(file="align1.anchors.png",width = 12000, height = 7000)
p
dev.off()
## png
## 2
Here we set scoring parameters -B -O1 -E1 -O2, to compare the alignment result of using different scoring parameters.
Using the wavefront alignment (WFA) algorithm, the match score is constantly set as 0. Empirically, we always set -E2 as -1.
We set -w, and -fa3 to minimum the usage of sliding window alignment approach and novel anchors. Those parameter would improve the alignment while cost memory than the default parameters.
With the following setting, each thread could cost as high as 50 Gb memory, and we ran this command on a computer with 512Gb memory available with 10 threads.
/usr/bin/time ./anchorwave proali -i Danio_rerio.GRCz11.102.chr.gff3 -r Danio_rerio.GRCz11.dna.primary_assembly.fa \
-a carAur.sam -as cds.fa -ar ref.sam -s goldfish.fa -n align2.anchors -o align2.maf -t 7 -R 2 -Q 1 -B -4 -O1 -4 -E1 -2 -O2 -80 -E2 -1 \
-f align2.f.maf -w 38000 -fa3 200000 > fishlog2 2>&1
Supplemental Table 5. List of 436,035 representative ATAC-seq peak.txt is from https://www.nature.com/articles/s41586-020-2962-9
and rename it to cre.bed
maf-convert sam align2.maf | sed 's/Danio_rerio.GRCz11.dna.primary_assembly.fa.//g' | sed 's/goldfish.fa.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > align.bam
samtools depth align.bam | wc -l #1145404904
samtools depth align.bam | awk '$3>0{print $0}' | wc -l # 313676584
sed -i -E 's/^chr//g' cre.bed
samtools depth -b cre.bed align.bam | wc -l #191251221
samtools depth -b cre.bed align.bam | awk '$3>0{print $0}' | wc -l #53884567
minimap2 -x asm5 -t 20 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa > minimap2_goldfish_5.sam
minimap2 -x asm10 -t 20 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa > minimap2_goldfish_10.sam
minimap2 -x asm20 -t 20 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa > minimap2_goldfish_20.sam
samtools sort minimap2_goldfish_5.sam > minimap2_goldfish_5.bam
samtools sort minimap2_goldfish_10.sam > minimap2_goldfish_10.bam
samtools sort minimap2_goldfish_20.sam > minimap2_goldfish_20.bam
samtools depth minimap2_goldfish_5.bam | wc -l #1350589
samtools depth minimap2_goldfish_10.bam | wc -l #6407183
samtools depth minimap2_goldfish_20.bam | wc -l #35881644
samtools depth minimap2_goldfish_5.bam | awk '$3>0{print $0}' | wc -l # 1343192
samtools depth minimap2_goldfish_10.bam | awk '$3>0{print $0}' | wc -l # 6329847
samtools depth minimap2_goldfish_20.bam | awk '$3>0{print $0}' | wc -l # 35002774
lastdb -P0 zebrafish Danio_rerio.GRCz11.dna.primary_assembly.fa
faToTwoBit Danio_rerio.GRCz11.dna.primary_assembly.fa zebrafish.2bit
faSize -detailed Danio_rerio.GRCz11.dna.primary_assembly.fa > zebrafish.size
lastal zebrafish goldfish.fa > goldfish_lastal.maf
faSize -detailed goldfish.fa > goldfish.size
faToTwoBit goldfish.fa goldfish.2bit
python2 /programs/last-932/bin/maf-convert psl goldfish_lastal.maf > goldfish_lastal.psl
axtChain -linearGap=loose -psl goldfish_lastal.psl -faQ -faT Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa goldfish_lastal.chain
chainMergeSort goldfish_lastal.chain > goldfish_lastal.all.chain
chainPreNet goldfish_lastal.all.chain zebrafish.size goldfish.size goldfish_lastal.preNet
chainNet goldfish_lastal.preNet zebrafish.size goldfish.size goldfish_lastal.refTarget.net goldfish_lastal.chainNet
netToAxt goldfish_lastal.refTarget.net goldfish_lastal.preNet zebrafish.2bit goldfish.2bit stdout | axtSort stdin goldfish_lastal.axt
axtToMaf goldfish_lastal.axt zebrafish.size goldfish.size goldfish_lastal_final.maf -qPrefix=query. -tPrefix=col.
perl lastFinalToSplit.pl goldfish_lastal_final.maf > goldfish_lastal_final_forsplit.maf
cat goldfish_lastal_final_forsplit.maf | last-split > goldfish_lastal_final_split.maf
maf-convert sam goldfish_lastal.maf| samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish_lastal.bam #many-to-many
maf-convert sam goldfish_lastal_final.maf| sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish_lastal_final.bam # many to one
maf-convert sam goldfish_lastal_final_split.maf | sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish_lastal_final_split.bam
samtools index goldfish_lastal.bam
samtools index goldfish_lastal_final.bam
samtools index goldfish_lastal_split.bam
samtools index goldfish_lastal_final_split.bam
samtools depth goldfish_lastal.bam | wc -l #628324746
samtools depth goldfish_lastal_split.bam | wc -l #210589780
samtools depth goldfish_lastal_final.bam | wc -l #556518096
samtools depth goldfish_lastal_final_split.bam | wc -l #252695645
samtools depth goldfish_lastal.bam | awk '$3>0{print $0}' | wc -l # 624893762
samtools depth goldfish_lastal_split.bam | awk '$3>0{print $0}' | wc -l # 207926386
samtools depth goldfish_lastal_final.bam | awk '$3>0{print $0}' | wc -l # 547064597
samtools depth goldfish_lastal_final_split.bam | awk '$3>0{print $0}'| wc -l # 248353893
nucmer -t 75 --sam-long=mumer.goldfish.short.sam Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa
cat mumer.goldfish.short.sam | grep -v "@" | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > mumer.goldfish.short.bam
samtools index mumer.goldfish.short.bam
samtools depth mumer.goldfish.short.bam | wc -l # 94867701
samtools depth mumer.goldfish.short.bam | awk '$3>0{print $0}' | wc -l # 93794614
GSAlign -r Danio_rerio.GRCz11.dna.primary_assembly.fa -q goldfish.fa -t 18 -o goldfish_gsalign -fmt 1
python2 /programs/last-932/bin/maf-convert sam goldfish_gsalign.maf | sed 's/qry.//g' | sed 's/ref.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish_gsalign.bam
samtools index goldfish_gsalign.bam
samtools depth goldfish_gsalign.bam | wc -l # 20059298
samtools depth goldfish_gsalign.bam | awk '$3>0{print $0}' | wc -l # 19574261
genomesize = read.table("Danio_rerio.GRCz11.dna.primary_assembly.fa.fai")
genomesize = sum(genomesize$V2)
library(ggplot2)
dat = read.table("goldFishSummary", sep="\t", header=TRUE)
dat$category = factor(dat$category, levels=c("unaligned", "gap", "position match"))
dat$approach <- factor(dat$approach, levels = c("AnchorWave", "minimap2_asm5", "minimap2_asm10", "minimap2_asm20", "LAST+many-to-many", "LAST+many-to-one", "LAST+one-to-one", "MUMmer4", "GSAlign"))
dat$proportion = dat$proportion/genomesize
p = ggplot(data=dat, aes(x=approach, y=proportion, fill=category)) + geom_bar(stat="identity") + scale_fill_manual(values=c("#54AEE1", "#92A000", "#EF8600")) + guides(fill=guide_legend(nrow=1,byrow=TRUE))+
labs(x="", y="Genome proportion", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
legend.position = "top",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "genome_aligned"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
d = read.table("goldfish.fa.fai")
sum(d$V2)/1000/1000/1000 # 1.273912
## [1] 1.273912
d = read.table("Danio_rerio.GRCz11.dna.primary_assembly.fa.fai")
d = head(d, 25)
sum(d$V2)/1000/1000/1000 # 1.345102
## [1] 1.345102
Then we seperated two subgenomes
seqkit grep -n -p LG1 goldfish.fa > goldfish1.fa
seqkit grep -n -p LG2 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG3 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG4 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG5 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG6 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG7 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG8 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG9 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG10 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG11 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG12 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG13 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG14 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG15 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG16 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG17 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG18 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG19 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG20 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG21 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG22 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG23 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG24 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG25 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG26 goldfish.fa > goldfish2.fa
seqkit grep -n -p LG27 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG28 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG28B goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG29 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG30 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG30F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG31 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG32 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG33 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG34 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG35 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG36 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG36F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG37 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG37M goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG38 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG39 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG40 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG41 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG42 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG42F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG43 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG44 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG44F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG45 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG45M goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG46 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG47 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG48 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG48F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG49 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG49B goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG50 goldfish.fa >> goldfish2.fa
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 goldfish1.fa cds.fa > carAur1.sam
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 goldfish2.fa cds.fa > carAur2.sam
/usr/bin/time -v anchorwave proali -i Danio_rerio.GRCz11.102.chr.gff3 -r Danio_rerio.GRCz11.dna.primary_assembly.fa -a carAur1.sam -as cds.fa -ar ref.sam -s goldfish1.fa -n alignh1.anchors -o alignh1.maf -R 1 -Q 1 -f alignh1.f.maf -B -4 -O1 -4 -E1 -2 -O2 -80 -E2 -1 -w 38000 -fa3 200000 > fishlogh1 2>&1
/usr/bin/time -v anchorwave proali -i Danio_rerio.GRCz11.102.chr.gff3 -r Danio_rerio.GRCz11.dna.primary_assembly.fa -a carAur2.sam -as cds.fa -ar ref.sam -s goldfish2.fa -n alignh2.anchors -o alignh2.maf -R 1 -Q 1 -f alignh2.f.maf -w 120000 -fa 80000 -fa2 120000 -fa3 80000 > fishlogh2 2>&1
maf-convert sam alignh1.maf | sed 's/Danio_rerio.GRCz11.dna.primary_assembly.fa.//g' | sed 's/goldfish1.fa.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > alignh1.bam
samtools depth alignh1.bam | wc -l #1103736001
samtools depth alignh1.bam | awk '$3>0{print $0}' | wc -l # 427549895
maf-convert sam alignh2.maf | sed 's/Danio_rerio.GRCz11.dna.primary_assembly.fa.//g' | sed 's/goldfish2.fa.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > alignh2.bam
samtools depth alignh2.bam | wc -l #949917011
samtools depth alignh2.bam | awk '$3>0{print $0}' | wc -l # 352417508
samtools merge alignh_seperate.sam alignh1.bam alignh2.bam
samtools depth alignh_seperate.sam | wc -l #1256261085
samtools depth alignh_seperate.sam | awk '$3>0{print $0}' | wc -l # 629358172
minimap2 -x asm5 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa > minimap2_goldfish1_5.sam
minimap2 -x asm10 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa > minimap2_goldfish1_10.sam
minimap2 -x asm20 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa > minimap2_goldfish1_20.sam
minimap2 -x asm5 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa > minimap2_goldfish2_5.sam
minimap2 -x asm10 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa > minimap2_goldfish2_10.sam
minimap2 -x asm20 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa > minimap2_goldfish2_20.sam
samtools sort minimap2_goldfish1_5.sam > minimap2_goldfish1_5.bam
samtools sort minimap2_goldfish1_10.sam > minimap2_goldfish1_10.bam
samtools sort minimap2_goldfish1_20.sam > minimap2_goldfish1_20.bam
samtools depth minimap2_goldfish1_5.bam | wc -l # 1091486
samtools depth minimap2_goldfish1_10.bam | wc -l # 5157696
samtools depth minimap2_goldfish1_20.bam | wc -l # 29328585
samtools depth minimap2_goldfish1_5.bam | awk '$3>0{print $0}' | wc -l # 1085063
samtools depth minimap2_goldfish1_10.bam | awk '$3>0{print $0}' | wc -l # 5090084
samtools depth minimap2_goldfish1_20.bam | awk '$3>0{print $0}' | wc -l # 28551676
samtools sort minimap2_goldfish2_5.sam > minimap2_goldfish2_5.bam
samtools sort minimap2_goldfish2_10.sam > minimap2_goldfish2_10.bam
samtools sort minimap2_goldfish2_20.sam > minimap2_goldfish2_20.bam
samtools depth minimap2_goldfish2_5.bam | wc -l # 571183
samtools depth minimap2_goldfish2_10.bam | wc -l # 3344970
samtools depth minimap2_goldfish2_20.bam | wc -l # 21840870
samtools depth minimap2_goldfish2_5.bam | awk '$3>0{print $0}' | wc -l # 567843
samtools depth minimap2_goldfish2_10.bam | awk '$3>0{print $0}' | wc -l # 3297946
samtools depth minimap2_goldfish2_20.bam | awk '$3>0{print $0}' | wc -l # 21215193
samtools merge minimap2_goldfish_5_seperate.bam minimap2_goldfish1_5.sam minimap2_goldfish2_5.sam
samtools merge minimap2_goldfish_10_seperate.bam minimap2_goldfish1_10.sam minimap2_goldfish2_10.sam
samtools merge minimap2_goldfish_20_seperate.bam minimap2_goldfish1_20.sam minimap2_goldfish2_20.sam
samtools sort minimap2_goldfish_5_seperate.bam > minimap2_goldfish_5_seperate_sort.bam
samtools sort minimap2_goldfish_10_seperate.bam > minimap2_goldfish_10_seperate_sort.bam
samtools sort minimap2_goldfish_20_seperate.bam > minimap2_goldfish_20_seperate_sort.bam
samtools depth minimap2_goldfish_5_seperate_sort.bam | wc -l # 1350589
samtools depth minimap2_goldfish_10_seperate_sort.bam | wc -l # 6407183
samtools depth minimap2_goldfish_20_seperate_sort.bam | wc -l # 35881644
samtools depth minimap2_goldfish_5_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 1343192
samtools depth minimap2_goldfish_10_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 6329847
samtools depth minimap2_goldfish_20_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 35002774
lastdb -P0 zebrafish Danio_rerio.GRCz11.dna.primary_assembly.fa
faToTwoBit Danio_rerio.GRCz11.dna.primary_assembly.fa zebrafish.2bit
faSize -detailed Danio_rerio.GRCz11.dna.primary_assembly.fa > zebrafish.size
lastal zebrafish goldfish1.fa > goldfish1_lastal.maf
faSize -detailed goldfish1.fa > goldfish1.size
faToTwoBit goldfish1.fa goldfish1.2bit
python2 /programs/last-932/bin/maf-convert psl goldfish1_lastal.maf > goldfish1_lastal.psl
axtChain -linearGap=loose -psl goldfish1_lastal.psl -faQ -faT Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa goldfish1_lastal.chain
chainMergeSort goldfish1_lastal.chain > goldfish1_lastal.all.chain
chainPreNet goldfish1_lastal.all.chain zebrafish.size goldfish1.size goldfish1_lastal.preNet
chainNet goldfish1_lastal.preNet zebrafish.size goldfish1.size goldfish1_lastal.refTarget.net goldfish1_lastal.chainNet
netToAxt goldfish1_lastal.refTarget.net goldfish1_lastal.preNet zebrafish.2bit goldfish1.2bit stdout | axtSort stdin goldfish1_lastal.axt
axtToMaf goldfish1_lastal.axt zebrafish.size goldfish1.size goldfish1_lastal_final.maf -qPrefix=query. -tPrefix=col.
perl lastFinalToSplit.pl goldfish1_lastal_final.maf > goldfish1_lastal_final_forsplit.maf
cat goldfish1_lastal.maf | last-split > goldfish1_lastal_split.maf
cat goldfish1_lastal_final_forsplit.maf | last-split > goldfish1_lastal_final_split.maf
python2 /programs/last-932/bin/maf-convert sam goldfish1_lastal.maf| samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_lastal.bam
python2 /programs/last-932/bin/maf-convert sam goldfish1_lastal_final.maf| sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_lastal_final.bam
python2 /programs/last-932/bin/maf-convert sam goldfish1_lastal_split.maf | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_lastal_split.bam
python2 /programs/last-932/bin/maf-convert sam goldfish1_lastal_final_split.maf | sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_lastal_final_split.bam
samtools index goldfish1_lastal.bam
samtools index goldfish1_lastal_final.bam
samtools index goldfish1_lastal_split.bam
samtools index goldfish1_lastal_final_split.bam
samtools depth goldfish1_lastal.bam | wc -l # 556566186
samtools depth goldfish1_lastal_split.bam | wc -l # 175401405
samtools depth goldfish1_lastal_final.bam | wc -l # 506859989
samtools depth goldfish1_lastal_final_split.bam | wc -l # 207853751
samtools depth goldfish1_lastal.bam | awk '$3>0{print $0}' | wc -l # 552251259
samtools depth goldfish1_lastal_split.bam | awk '$3>0{print $0}' | wc -l # 171964354
samtools depth goldfish1_lastal_final.bam | awk '$3>0{print $0}' | wc -l # 498521800
samtools depth goldfish1_lastal_final_split.bam | awk '$3>0{print $0}'| wc -l # 204194727
lastdb -P0 zebrafish Danio_rerio.GRCz11.dna.primary_assembly.fa
faToTwoBit Danio_rerio.GRCz11.dna.primary_assembly.fa zebrafish.2bit
faSize -detailed Danio_rerio.GRCz11.dna.primary_assembly.fa > zebrafish.size
lastal zebrafish goldfish2.fa > goldfish2_lastal.maf
faSize -detailed goldfish2.fa > goldfish2.size
faToTwoBit goldfish2.fa goldfish2.2bit
python2 /programs/last-932/bin/maf-convert psl goldfish2_lastal.maf > goldfish2_lastal.psl
axtChain -linearGap=loose -psl goldfish2_lastal.psl -faQ -faT Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa goldfish2_lastal.chain
chainMergeSort goldfish2_lastal.chain > goldfish2_lastal.all.chain
chainPreNet goldfish2_lastal.all.chain zebrafish.size goldfish2.size goldfish2_lastal.preNet
chainNet goldfish2_lastal.preNet zebrafish.size goldfish2.size goldfish2_lastal.refTarget.net goldfish2_lastal.chainNet
netToAxt goldfish2_lastal.refTarget.net goldfish2_lastal.preNet zebrafish.2bit goldfish2.2bit stdout | axtSort stdin goldfish2_lastal.axt
axtToMaf goldfish2_lastal.axt zebrafish.size goldfish2.size goldfish2_lastal_final.maf -qPrefix=query. -tPrefix=col.
perl lastFinalToSplit.pl goldfish2_lastal_final.maf > goldfish2_lastal_final_forsplit.maf
cat goldfish2_lastal.maf | last-split > goldfish2_lastal_split.maf
cat goldfish2_lastal_final_forsplit.maf | last-split > goldfish2_lastal_final_split.maf
python2 /programs/last-932/bin/maf-convert sam goldfish2_lastal.maf| samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_lastal.bam
python2 /programs/last-932/bin/maf-convert sam goldfish2_lastal_final.maf| sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_lastal_final.bam
python2 /programs/last-932/bin/maf-convert sam goldfish2_lastal_split.maf | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_lastal_split.bam
python2 /programs/last-932/bin/maf-convert sam goldfish2_lastal_final_split.maf | sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_lastal_final_split.bam
samtools index goldfish2_lastal.bam
samtools index goldfish2_lastal_final.bam
samtools index goldfish2_lastal_split.bam
samtools index goldfish2_lastal_final_split.bam
samtools depth goldfish2_lastal.bam | wc -l # 512932320
samtools depth goldfish2_lastal_split.bam | wc -l # 145622396
samtools depth goldfish2_lastal_final.bam | wc -l # 467063301
samtools depth goldfish2_lastal_final_split.bam | wc -l # 173644026
samtools depth goldfish2_lastal.bam | awk '$3>0{print $0}' | wc -l # 509004222
samtools depth goldfish2_lastal_split.bam | awk '$3>0{print $0}' | wc -l # 142626377
samtools depth goldfish2_lastal_final.bam | awk '$3>0{print $0}' | wc -l # 459369555
samtools depth goldfish2_lastal_final_split.bam | awk '$3>0{print $0}'| wc -l # 170487273
samtools merge goldfish_lastal_seperate.bam goldfish1_lastal.bam goldfish2_lastal.bam
samtools sort goldfish_lastal_seperate.bam > goldfish_lastal_seperate_sort.bam
samtools depth goldfish_lastal_seperate_sort.bam | wc -l # 628324746
samtools depth goldfish_lastal_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 624893762
samtools merge goldfish_lastal_split_seperate.bam goldfish1_lastal_split.bam goldfish2_lastal_split.bam
samtools sort goldfish_lastal_split_seperate.bam > goldfish_lastal_split_seperate_sort.bam
samtools depth goldfish_lastal_split_seperate_sort.bam | wc -l # 210589780
samtools depth goldfish_lastal_split_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 207926386
samtools merge goldfish_lastal_final_seperate.bam goldfish1_lastal_final.bam goldfish2_lastal_final.bam
samtools sort goldfish_lastal_final_seperate.bam > goldfish_lastal_final_seperate_sort.bam
<!-- samtools depth goldfish_lastal_final_seperate_sort.bam | wc -l # 592717560 -->
samtools depth goldfish_lastal_final_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 587452812
samtools merge goldfish_lastal_final_split_seperate.bam goldfish1_lastal_final_split.bam goldfish2_lastal_final_split.bam
samtools sort goldfish_lastal_final_split_seperate.bam > goldfish_lastal_final_split_seperate_sort.bam
samtools depth goldfish_lastal_final_split_seperate_sort.bam | wc -l # 264115166
samtools depth goldfish_lastal_final_split_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 261043159
nucmer -t 75 --sam-long=mumer.goldfish1.short.sam Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa
cat mumer.goldfish1.short.sam | grep -v "@" | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > mumer.goldfish1.short.bam
samtools index mumer.goldfish1.short.bam #
samtools depth mumer.goldfish1.short.bam | wc -l # 76143143
samtools depth mumer.goldfish1.short.bam | awk '$3>0{print $0}' | wc -l # 75031298
nucmer -t 75 --sam-long=mumer.goldfish2.short.sam Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa
cat mumer.goldfish2.short.sam | grep -v "@" | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > mumer.goldfish2.short.bam
samtools index mumer.goldfish2.short.bam #
samtools depth mumer.goldfish2.short.bam | wc -l # 60735316
samtools depth mumer.goldfish2.short.bam | awk '$3>0{print $0}' | wc -l # 59835463
samtools merge mumer.goldfish.short_seperate.bam mumer.goldfish1.short.bam mumer.goldfish2.short.bam
samtools sort mumer.goldfish.short_seperate.bam > mumer.goldfish.short_seperate_sort.bam
samtools depth mumer.goldfish.short_seperate_sort.bam | wc -l # 94867701
samtools depth mumer.goldfish.short_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 93794614
GSAlign -r Danio_rerio.GRCz11.dna.primary_assembly.fa -q goldfish1.fa -t 18 -o goldfish1_gsalign -fmt 1
python2 /programs/last-932/bin/maf-convert sam goldfish1_gsalign.maf | sed 's/qry.//g' | sed 's/ref.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_gsalign.bam
samtools index goldfish1_gsalign.bam
samtools depth goldfish1_gsalign.bam | wc -l # 13165258
samtools depth goldfish1_gsalign.bam | awk '$3>0{print $0}' | wc -l # 12830191
GSAlign -r Danio_rerio.GRCz11.dna.primary_assembly.fa -q goldfish2.fa -t 70 -o goldfish2_gsalign -fmt 1
python2 /programs/last-932/bin/maf-convert sam goldfish2_gsalign.maf | sed 's/qry.//g' | sed 's/ref.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_gsalign.bam
samtools index goldfish2_gsalign.bam
samtools depth goldfish2_gsalign.bam | wc -l # 9909623
samtools depth goldfish2_gsalign.bam | awk '$3>0{print $0}' | wc -l # 9638825
samtools merge goldfish2_gsalign_seperate.bam goldfish1_gsalign.bam goldfish2_gsalign.bam
samtools sort goldfish2_gsalign_seperate.bam >goldfish2_gsalign_seperate_sort.bam
samtools depth goldfish2_gsalign_seperate_sort.bam | wc -l # 20038773
samtools depth goldfish2_gsalign_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 19553866
genomesize = read.table("Danio_rerio.GRCz11.dna.primary_assembly.fa.fai")
genomesize = sum(genomesize$V2)
library(ggplot2)
dat = read.table("goldFishSummary2", sep="\t", header=TRUE)
dat$category = factor(dat$category, levels=c("unaligned", "gap", "position match"))
dat$approach <- factor(dat$approach, levels = c("AnchorWave", "minimap2_asm5", "minimap2_asm10", "minimap2_asm20", "LAST+many-to-many", "LAST+many-to-one", "LAST+one-to-one", "MUMmer4", "GSAlign"))
dat$proportion = dat$proportion/genomesize
p = ggplot(data=dat, aes(x=approach, y=proportion, fill=category)) + geom_bar(stat="identity") + scale_fill_manual(values=c("#55B0E4", "#FFF800", "#FFBD00")) + guides(fill=guide_legend(nrow=1,byrow=TRUE))+
labs(x="", y="Genome porportion", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
legend.position = "top",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "genome_aligned2"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
d = read.table("goldfish.fa.fai")
sum(d$V2)/1000/1000/1000 # 1.273912
## [1] 1.273912
d = read.table("Danio_rerio.GRCz11.dna.primary_assembly.fa.fai")
d = head(d, 25)
sum(d$V2)/1000/1000/1000 # 1.345102
## [1] 1.345102